
<!DOCTYPE html>

<html lang="en">
  <head>
    <meta charset="utf-8" />
    <meta name="viewport" content="width=device-width, initial-scale=1.0" /><meta name="generator" content="Docutils 0.18.1: http://docutils.sourceforge.net/" />

    <title>第五章 变形 &#8212; Joyful Pandas 1.0 documentation</title>
<script>
  document.documentElement.dataset.mode = localStorage.getItem("mode") || "";
  document.documentElement.dataset.theme = localStorage.getItem("theme") || "light"
</script>

  <!-- Loaded before other Sphinx assets -->
  <link href="../_static/styles/theme.css?digest=92025949c220c2e29695" rel="stylesheet">
<link href="../_static/styles/pydata-sphinx-theme.css?digest=92025949c220c2e29695" rel="stylesheet">


  <link rel="stylesheet"
    href="../_static/vendor/fontawesome/5.13.0/css/all.min.css">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-solid-900.woff2">
  <link rel="preload" as="font" type="font/woff2" crossorigin
    href="../_static/vendor/fontawesome/5.13.0/webfonts/fa-brands-400.woff2">

    <link rel="stylesheet" type="text/css" href="../_static/pygments.css" />
    <link rel="stylesheet" type="text/css" href="../_static/plot_directive.css" />
    <link rel="stylesheet" type="text/css" href="../_static/css/s4defs-roles.css" />

  <!-- Pre-loaded scripts that we'll load fully later -->
  <link rel="preload" as="script" href="../_static/scripts/pydata-sphinx-theme.js?digest=92025949c220c2e29695">

    <script data-url_root="../" id="documentation_options" src="../_static/documentation_options.js"></script>
    <script src="../_static/jquery.js"></script>
    <script src="../_static/underscore.js"></script>
    <script src="../_static/_sphinx_javascript_frameworks_compat.js"></script>
    <script src="../_static/doctools.js"></script>
    <link rel="index" title="Index" href="../genindex.html" />
    <link rel="search" title="Search" href="../search.html" />
    <link rel="next" title="第六章 连接" href="ch6.html" />
    <link rel="prev" title="第四章 分组" href="ch4.html" />
<meta name="viewport" content="width=device-width, initial-scale=1" />
<meta name="docsearch:language" content="en">
  </head>
  
  
  <body data-spy="scroll" data-target="#bd-toc-nav" data-offset="180" data-default-mode="">
    <div class="bd-header-announcement container-fluid" id="banner">
      

    </div>

    
    <nav class="bd-header navbar navbar-light navbar-expand-lg bg-light fixed-top bd-navbar" id="navbar-main"><div class="bd-header__inner container-xl">

  <div id="navbar-start">
    
    
  


<a class="navbar-brand logo" href="../index.html">
  
  
  
  
    <img src="../_static/finallogo1.svg" class="logo__image only-light" alt="Logo image">
    <img src="../_static/finallogo1.svg" class="logo__image only-dark" alt="Logo image">
  
  
</a>
    
  </div>

  <button class="navbar-toggler" type="button" data-toggle="collapse" data-target="#navbar-collapsible" aria-controls="navbar-collapsible" aria-expanded="false" aria-label="Toggle navigation">
    <span class="fas fa-bars"></span>
  </button>

  
  <div id="navbar-collapsible" class="col-lg-9 collapse navbar-collapse">
    <div id="navbar-center" class="mr-auto">
      
      <div class="navbar-center-item">
        <ul id="navbar-main-elements" class="navbar-nav">
    <li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../Home.html">
  Home
 </a>
</li>

<li class="toctree-l1 current active nav-item">
 <a class="reference internal nav-link" href="index.html">
  Content
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../Author.html">
  Author
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../Datawhale.html">
  Datawhale
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../pandas%E6%95%B0%E6%8D%AE%E5%A4%84%E7%90%86%E4%B8%8E%E5%88%86%E6%9E%90.html">
  pandas数据处理与分析
 </a>
</li>

<li class="toctree-l1 nav-item">
 <a class="reference internal nav-link" href="../%E8%A1%A5%E5%85%85%E4%B9%A0%E9%A2%98.html">
  补充习题
 </a>
</li>

    
    <li class="nav-item">
        <a class="nav-link nav-external" href="https://pandas.pydata.org/docs/index.html">Doc<i class="fas fa-external-link-alt"></i></a>
    </li>
    
</ul>
      </div>
      
    </div>

    <div id="navbar-end">
      
      <div class="navbar-end-item">
        <span id="theme-switch" class="btn btn-sm btn-outline-primary navbar-btn rounded-circle">
    <a class="theme-switch" data-mode="light"><i class="fas fa-sun"></i></a>
    <a class="theme-switch" data-mode="dark"><i class="far fa-moon"></i></a>
    <a class="theme-switch" data-mode="auto"><i class="fas fa-adjust"></i></a>
</span>
      </div>
      
      <div class="navbar-end-item">
        <ul id="navbar-icon-links" class="navbar-nav" aria-label="Icon Links">
        <li class="nav-item">
          <a class="nav-link" href="https://github.com/datawhalechina/joyful-pandas" rel="noopener" target="_blank" title="GitHub"><span><i class="fab fa-github-square"></i></span>
            <label class="sr-only">GitHub</label></a>
        </li>
      </ul>
      </div>
      
    </div>
  </div>
</div>
    </nav>
    

    <div class="bd-container container-xl">
      <div class="bd-container__inner row">
          

<!-- Only show if we have sidebars configured, else just a small margin  -->
<div class="bd-sidebar-primary col-12 col-md-3 bd-sidebar">
  <div class="sidebar-start-items"><form class="bd-search d-flex align-items-center" action="../search.html" method="get">
  <i class="icon fas fa-search"></i>
  <input type="search" class="form-control" name="q" id="search-input" placeholder="Search the docs ..." aria-label="Search the docs ..." autocomplete="off" >
</form><nav class="bd-links" id="bd-docs-nav" aria-label="Main navigation">
  <div class="bd-toc-item active">
    <ul class="current nav bd-sidenav">
 <li class="toctree-l1">
  <a class="reference internal" href="ch1.html">
   第一章 预备知识
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch2.html">
   第二章 pandas基础
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch3.html">
   第三章 索引
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch4.html">
   第四章 分组
  </a>
 </li>
 <li class="toctree-l1 current active">
  <a class="current reference internal" href="#">
   第五章 变形
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch6.html">
   第六章 连接
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch7.html">
   第七章 缺失数据
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch8.html">
   第八章 文本数据
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch9.html">
   第九章 分类数据
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="ch10.html">
   第十章 时序数据
  </a>
 </li>
 <li class="toctree-l1">
  <a class="reference internal" href="%E5%8F%82%E8%80%83%E7%AD%94%E6%A1%88.html">
   参考答案
  </a>
 </li>
</ul>

  </div>
</nav>
  </div>
  <div class="sidebar-end-items">
  </div>
</div>


          


<div class="bd-sidebar-secondary d-none d-xl-block col-xl-2 bd-toc">
  
    
    <div class="toc-item">
      
<div class="tocsection onthispage mt-5 pt-1 pb-3">
    <i class="fas fa-list"></i> On this page
</div>

<nav id="bd-toc-nav">
    <ul class="visible nav section-nav flex-column">
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id2">
   一、长宽表的变形
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#pivot">
     1. pivot
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#pivot-table">
     2. pivot_table
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#melt">
     3. melt
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#wide-to-long">
     4. wide_to_long
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id3">
   二、索引的变形
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#stackunstack">
     1. stack与unstack
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#id4">
     2. 聚合与变形的关系
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id5">
   三、其他变形函数
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#crosstab">
     1. crosstab
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#explode">
     2. explode
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#get-dummies">
     3. get_dummies
    </a>
   </li>
  </ul>
 </li>
 <li class="toc-h2 nav-item toc-entry">
  <a class="reference internal nav-link" href="#id6">
   四、练习
  </a>
  <ul class="nav section-nav flex-column">
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex1">
     Ex1：美国非法药物数据集
    </a>
   </li>
   <li class="toc-h3 nav-item toc-entry">
    <a class="reference internal nav-link" href="#ex2-wide-to-long">
     Ex2：特殊的wide_to_long方法
    </a>
   </li>
  </ul>
 </li>
</ul>

</nav>
    </div>
    
    <div class="toc-item">
      
    </div>
    
  
</div>


          
          
          <div class="bd-content col-12 col-md-9 col-xl-7">
              
              <article class="bd-article" role="main">
                
  <section id="id1">
<h1>第五章 变形<a class="headerlink" href="#id1" title="Permalink to this heading">#</a></h1>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [1]: </span><span class="kn">import</span> <span class="nn">numpy</span> <span class="k">as</span> <span class="nn">np</span>

<span class="gp">In [2]: </span><span class="kn">import</span> <span class="nn">pandas</span> <span class="k">as</span> <span class="nn">pd</span>
</pre></div>
</div>
<section id="id2">
<h2>一、长宽表的变形<a class="headerlink" href="#id2" title="Permalink to this heading">#</a></h2>
<p>什么是长表？什么是宽表？这个概念是对于某一个特征而言的。例如：一个表中把性别存储在某一个列中，那么它就是关于性别的长表；如果把性别作为列名，列中的元素是某一其他的相关特征数值，那么这个表是关于性别的宽表。下面的两张表就分别是关于性别的长表和宽表：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [3]: </span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Gender&#39;</span><span class="p">:[</span><span class="s1">&#39;F&#39;</span><span class="p">,</span><span class="s1">&#39;F&#39;</span><span class="p">,</span><span class="s1">&#39;M&#39;</span><span class="p">,</span><span class="s1">&#39;M&#39;</span><span class="p">],</span>
<span class="gp">   ...: </span>              <span class="s1">&#39;Height&#39;</span><span class="p">:[</span><span class="mi">163</span><span class="p">,</span> <span class="mi">160</span><span class="p">,</span> <span class="mi">175</span><span class="p">,</span> <span class="mi">180</span><span class="p">]})</span>
<span class="gp">   ...: </span>
<span class="gh">Out[3]: </span>
<span class="go">  Gender  Height</span>
<span class="go">0      F     163</span>
<span class="go">1      F     160</span>
<span class="go">2      M     175</span>
<span class="go">3      M     180</span>

<span class="gp">In [4]: </span><span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Height: F&#39;</span><span class="p">:[</span><span class="mi">163</span><span class="p">,</span> <span class="mi">160</span><span class="p">],</span>
<span class="gp">   ...: </span>              <span class="s1">&#39;Height: M&#39;</span><span class="p">:[</span><span class="mi">175</span><span class="p">,</span> <span class="mi">180</span><span class="p">]})</span>
<span class="gp">   ...: </span>
<span class="gh">Out[4]: </span>
<span class="go">   Height: F  Height: M</span>
<span class="go">0        163        175</span>
<span class="go">1        160        180</span>
</pre></div>
</div>
<p>显然这两张表从信息上是完全等价的，它们包含相同的身高统计数值，只是这些数值的呈现方式不同，而其呈现方式主要又与性别一列选择的布局模式有关，即到底是以 <span class="red">long</span> 的状态存储还是以 <span class="red">wide</span> 的状态存储。因此， <code class="docutils literal notranslate"><span class="pre">pandas</span></code> 针对此类长宽表的变形操作设计了一些有关的变形函数。</p>
<section id="pivot">
<h3>1. pivot<a class="headerlink" href="#pivot" title="Permalink to this heading">#</a></h3>
<p><code class="docutils literal notranslate"><span class="pre">pivot</span></code> 是一种典型的长表变宽表的函数，首先来看一个例子：下表存储了张三和李四的语文和数学分数，现在想要把语文和数学分数作为列来展示。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [5]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Class&#39;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">,</span><span class="mi">2</span><span class="p">],</span>
<span class="gp">   ...: </span>                   <span class="s1">&#39;Name&#39;</span><span class="p">:[</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span><span class="s1">&#39;Si Li&#39;</span><span class="p">,</span><span class="s1">&#39;Si Li&#39;</span><span class="p">],</span>
<span class="gp">   ...: </span>                   <span class="s1">&#39;Subject&#39;</span><span class="p">:[</span><span class="s1">&#39;Chinese&#39;</span><span class="p">,</span><span class="s1">&#39;Math&#39;</span><span class="p">,</span><span class="s1">&#39;Chinese&#39;</span><span class="p">,</span><span class="s1">&#39;Math&#39;</span><span class="p">],</span>
<span class="gp">   ...: </span>                   <span class="s1">&#39;Grade&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span><span class="mi">75</span><span class="p">,</span><span class="mi">90</span><span class="p">,</span><span class="mi">85</span><span class="p">]})</span>
<span class="gp">   ...: </span>

<span class="gp">In [6]: </span><span class="n">df</span>
<span class="gh">Out[6]: </span>
<span class="go">   Class       Name  Subject  Grade</span>
<span class="go">0      1  San Zhang  Chinese     80</span>
<span class="go">1      1  San Zhang     Math     75</span>
<span class="go">2      2      Si Li  Chinese     90</span>
<span class="go">3      2      Si Li     Math     85</span>
</pre></div>
</div>
<p>对于一个基本的长变宽操作而言，最重要的有三个要素，分别是变形后的行索引、需要转到列索引的列，以及这些列和行索引对应的数值，它们分别对应了 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 方法中的 <code class="docutils literal notranslate"><span class="pre">index,</span> <span class="pre">columns,</span> <span class="pre">values</span></code> 参数。新生成表的列索引是 <code class="docutils literal notranslate"><span class="pre">columns</span></code> 对应列的 <code class="docutils literal notranslate"><span class="pre">unique</span></code> 值，而新表的行索引是 <code class="docutils literal notranslate"><span class="pre">index</span></code> 对应列的 <code class="docutils literal notranslate"><span class="pre">unique</span></code> 值，而 <code class="docutils literal notranslate"><span class="pre">values</span></code> 对应了想要展示的数值列。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [7]: </span><span class="n">df</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="s1">&#39;Name&#39;</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="s1">&#39;Subject&#39;</span><span class="p">,</span> <span class="n">values</span><span class="o">=</span><span class="s1">&#39;Grade&#39;</span><span class="p">)</span>
<span class="gh">Out[7]: </span>
<span class="go">Subject    Chinese  Math</span>
<span class="go">Name                    </span>
<span class="go">San Zhang       80    75</span>
<span class="go">Si Li           90    85</span>
</pre></div>
</div>
<p>通过颜色的标记，更容易地能够理解其变形的过程：</p>
<a class="reference internal image-reference" href="../_images/ch5_pivot.png"><img alt="../_images/ch5_pivot.png" class="align-center" src="../_images/ch5_pivot.png" style="width: 300.0px; height: 342.8px;" /></a>
<p>利用 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 进行变形操作需要满足唯一性的要求，即由于在新表中的行列索引对应了唯一的 <code class="docutils literal notranslate"><span class="pre">value</span></code> ，因此原表中的 <code class="docutils literal notranslate"><span class="pre">index</span></code> 和 <code class="docutils literal notranslate"><span class="pre">columns</span></code> 对应两个列的行组合必须唯一。例如，现在把原表中第二行张三的数学改为语文就会报错，这是由于 <code class="docutils literal notranslate"><span class="pre">Name</span></code> 与 <code class="docutils literal notranslate"><span class="pre">Subject</span></code> 的组合中两次出现 <code class="docutils literal notranslate"><span class="pre">(&quot;San</span> <span class="pre">Zhang&quot;,</span> <span class="pre">&quot;Chinese&quot;)</span></code> ，从而最后不能够确定到底变形后应该是填写80分还是75分。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [8]: </span><span class="n">df</span><span class="o">.</span><span class="n">loc</span><span class="p">[</span><span class="mi">1</span><span class="p">,</span> <span class="s1">&#39;Subject&#39;</span><span class="p">]</span> <span class="o">=</span> <span class="s1">&#39;Chinese&#39;</span>

<span class="gp">In [9]: </span><span class="k">try</span><span class="p">:</span>
<span class="gp">   ...: </span> <span class="n">df</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">index</span><span class="o">=</span><span class="s1">&#39;Name&#39;</span><span class="p">,</span> <span class="n">columns</span><span class="o">=</span><span class="s1">&#39;Subject&#39;</span><span class="p">,</span> <span class="n">values</span><span class="o">=</span><span class="s1">&#39;Grade&#39;</span><span class="p">)</span>
<span class="gp">   ...: </span><span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="gp">   ...: </span>   <span class="n">Err_Msg</span> <span class="o">=</span> <span class="n">e</span>
<span class="gp">   ...: </span>

<span class="gp">In [10]: </span><span class="n">Err_Msg</span>
<span class="gh">Out[10]: </span><span class="go">ValueError(&#39;Index contains duplicate entries, cannot reshape&#39;)</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">pandas</span></code> 从 <code class="docutils literal notranslate"><span class="pre">1.1.0</span></code> 开始， <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 相关的三个参数允许被设置为列表，这也意味着会返回多级索引。这里构造一个相应的例子来说明如何使用：下表中六列分别为班级、姓名、测试类型（期中考试和期末考试）、科目、成绩、排名。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [11]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Class&#39;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Name&#39;</span><span class="p">:[</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                           <span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Examination&#39;</span><span class="p">:</span> <span class="p">[</span><span class="s1">&#39;Mid&#39;</span><span class="p">,</span> <span class="s1">&#39;Final&#39;</span><span class="p">,</span> <span class="s1">&#39;Mid&#39;</span><span class="p">,</span> <span class="s1">&#39;Final&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                                 <span class="s1">&#39;Mid&#39;</span><span class="p">,</span> <span class="s1">&#39;Final&#39;</span><span class="p">,</span> <span class="s1">&#39;Mid&#39;</span><span class="p">,</span> <span class="s1">&#39;Final&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Subject&#39;</span><span class="p">:[</span><span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Chinese&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                              <span class="s1">&#39;Math&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Grade&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">75</span><span class="p">,</span> <span class="mi">85</span><span class="p">,</span> <span class="mi">65</span><span class="p">,</span> <span class="mi">90</span><span class="p">,</span> <span class="mi">85</span><span class="p">,</span> <span class="mi">92</span><span class="p">,</span> <span class="mi">88</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;rank&#39;</span><span class="p">:[</span><span class="mi">10</span><span class="p">,</span> <span class="mi">15</span><span class="p">,</span> <span class="mi">21</span><span class="p">,</span> <span class="mi">15</span><span class="p">,</span> <span class="mi">20</span><span class="p">,</span> <span class="mi">7</span><span class="p">,</span> <span class="mi">6</span><span class="p">,</span> <span class="mi">2</span><span class="p">]})</span>
<span class="gp">   ....: </span>

<span class="gp">In [12]: </span><span class="n">df</span>
<span class="gh">Out[12]: </span>
<span class="go">   Class       Name Examination  Subject  Grade  rank</span>
<span class="go">0      1  San Zhang         Mid  Chinese     80    10</span>
<span class="go">1      1  San Zhang       Final  Chinese     75    15</span>
<span class="go">2      2      Si Li         Mid  Chinese     85    21</span>
<span class="go">3      2      Si Li       Final  Chinese     65    15</span>
<span class="go">4      1  San Zhang         Mid     Math     90    20</span>
<span class="go">5      1  San Zhang       Final     Math     85     7</span>
<span class="go">6      2      Si Li         Mid     Math     92     6</span>
<span class="go">7      2      Si Li       Final     Math     88     2</span>
</pre></div>
</div>
<p>现在想要把测试类型和科目联合组成的四个类别（期中语文、期末语文、期中数学、期末数学）转到列索引，并且同时统计成绩和排名：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [13]: </span><span class="n">pivot_multi</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Class&#39;</span><span class="p">,</span> <span class="s1">&#39;Name&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                       <span class="n">columns</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Subject&#39;</span><span class="p">,</span><span class="s1">&#39;Examination&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                       <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Grade&#39;</span><span class="p">,</span><span class="s1">&#39;rank&#39;</span><span class="p">])</span>
<span class="gp">   ....: </span>

<span class="gp">In [14]: </span><span class="n">pivot_multi</span>
<span class="gh">Out[14]: </span>
<span class="go">                  Grade                     rank                 </span>
<span class="go">Subject         Chinese       Math       Chinese       Math      </span>
<span class="go">Examination         Mid Final  Mid Final     Mid Final  Mid Final</span>
<span class="go">Class Name                                                       </span>
<span class="go">1     San Zhang      80    75   90    85      10    15   20     7</span>
<span class="go">2     Si Li          85    65   92    88      21    15    6     2</span>
</pre></div>
</div>
<p>根据唯一性原则，新表的行索引等价于对 <code class="docutils literal notranslate"><span class="pre">index</span></code> 中的多列使用 <code class="docutils literal notranslate"><span class="pre">drop_duplicates</span></code> ，而列索引的长度为 <code class="docutils literal notranslate"><span class="pre">values</span></code> 中的元素个数乘以 <code class="docutils literal notranslate"><span class="pre">columns</span></code> 的唯一组合数量（与 <code class="docutils literal notranslate"><span class="pre">index</span></code> 类似） 。从下面的示意图中能够比较容易地理解相应的操作：</p>
<a class="reference internal image-reference" href="../_images/ch5_mulpivot.png"><img alt="../_images/ch5_mulpivot.png" class="align-center" src="../_images/ch5_mulpivot.png" style="width: 430.0px; height: 484.18px;" /></a>
</section>
<section id="pivot-table">
<h3>2. pivot_table<a class="headerlink" href="#pivot-table" title="Permalink to this heading">#</a></h3>
<p><code class="docutils literal notranslate"><span class="pre">pivot</span></code> 的使用依赖于唯一性条件，那如果不满足唯一性条件，那么必须通过聚合操作使得相同行列组合对应的多个值变为一个值。例如，张三和李四都参加了两次语文考试和数学考试，按照学院规定，最后的成绩是两次考试分数的平均值，此时就无法通过 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 函数来完成。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [15]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Name&#39;</span><span class="p">:[</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                           <span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                           <span class="s1">&#39;Si Li&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Subject&#39;</span><span class="p">:[</span><span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                              <span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Grade&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">90</span><span class="p">,</span> <span class="mi">100</span><span class="p">,</span> <span class="mi">90</span><span class="p">,</span> <span class="mi">70</span><span class="p">,</span> <span class="mi">80</span><span class="p">,</span> <span class="mi">85</span><span class="p">,</span> <span class="mi">95</span><span class="p">]})</span>
<span class="gp">   ....: </span>

<span class="gp">In [16]: </span><span class="n">df</span>
<span class="gh">Out[16]: </span>
<span class="go">        Name  Subject  Grade</span>
<span class="go">0  San Zhang  Chinese     80</span>
<span class="go">1  San Zhang  Chinese     90</span>
<span class="go">2  San Zhang     Math    100</span>
<span class="go">3  San Zhang     Math     90</span>
<span class="go">4      Si Li  Chinese     70</span>
<span class="go">5      Si Li  Chinese     80</span>
<span class="go">6      Si Li     Math     85</span>
<span class="go">7      Si Li     Math     95</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">pandas</span></code> 中提供了 <code class="docutils literal notranslate"><span class="pre">pivot_table</span></code> 来实现，其中的 <code class="docutils literal notranslate"><span class="pre">aggfunc</span></code> 参数就是使用的聚合函数。上述场景可以如下写出：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [17]: </span><span class="n">df</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="s1">&#39;Name&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">columns</span> <span class="o">=</span> <span class="s1">&#39;Subject&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">values</span> <span class="o">=</span> <span class="s1">&#39;Grade&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">aggfunc</span> <span class="o">=</span> <span class="s1">&#39;mean&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>
<span class="gh">Out[17]: </span>
<span class="go">Subject    Chinese  Math</span>
<span class="go">Name                    </span>
<span class="go">San Zhang       85    95</span>
<span class="go">Si Li           75    90</span>
</pre></div>
</div>
<p>这里传入 <code class="docutils literal notranslate"><span class="pre">aggfunc</span></code> 包含了上一章中介绍的所有合法聚合字符串，此外还可以传入以序列为输入标量为输出的聚合函数来实现自定义操作，上述功能可以等价写出：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [18]: </span><span class="n">df</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="s1">&#39;Name&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">columns</span> <span class="o">=</span> <span class="s1">&#39;Subject&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">values</span> <span class="o">=</span> <span class="s1">&#39;Grade&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">aggfunc</span> <span class="o">=</span> <span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="n">x</span><span class="o">.</span><span class="n">mean</span><span class="p">())</span>
<span class="gp">   ....: </span>
<span class="gh">Out[18]: </span>
<span class="go">Subject    Chinese  Math</span>
<span class="go">Name                    </span>
<span class="go">San Zhang       85    95</span>
<span class="go">Si Li           75    90</span>
</pre></div>
</div>
<p>此外， <code class="docutils literal notranslate"><span class="pre">pivot_table</span></code> 具有边际汇总的功能，可以通过设置 <code class="docutils literal notranslate"><span class="pre">margins=True</span></code> 来实现，其中边际的聚合方式与 <code class="docutils literal notranslate"><span class="pre">aggfunc</span></code> 中给出的聚合方法一致。下面就分别统计了语文均分和数学均分、张三均分和李四均分，以及总体所有分数的均分：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [19]: </span><span class="n">df</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="s1">&#39;Name&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">columns</span> <span class="o">=</span> <span class="s1">&#39;Subject&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">values</span> <span class="o">=</span> <span class="s1">&#39;Grade&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">aggfunc</span><span class="o">=</span><span class="s1">&#39;mean&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">margins</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   ....: </span>
<span class="gh">Out[19]: </span>
<span class="go">Subject    Chinese  Math    All</span>
<span class="go">Name                           </span>
<span class="go">San Zhang       85  95.0  90.00</span>
<span class="go">Si Li           75  90.0  82.50</span>
<span class="go">All             80  92.5  86.25</span>
</pre></div>
</div>
<div class="hint admonition">
<p class="admonition-title">练一练</p>
<blockquote>
<div><p>在上面的边际汇总例子中，行或列的汇总为新表中行元素或者列元素的平均值，而总体的汇总为新表中四个元素的平均值。这种关系一定成立吗？若不成立，请给出一个例子来说明。</p>
</div></blockquote>
</div>
</section>
<section id="melt">
<h3>3. melt<a class="headerlink" href="#melt" title="Permalink to this heading">#</a></h3>
<p>长宽表只是数据呈现方式的差异，但其包含的信息量是等价的，前面提到了利用 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 把长表转为宽表，那么就可以通过相应的逆操作把宽表转为长表， <code class="docutils literal notranslate"><span class="pre">melt</span></code> 函数就起到了这样的作用。在下面的例子中， <code class="docutils literal notranslate"><span class="pre">Subject</span></code> 以列索引的形式存储，现在想要将其压缩到一个列中。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [20]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Class&#39;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Name&#39;</span><span class="p">:[</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Chinese&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">90</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Math&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">75</span><span class="p">]})</span>
<span class="gp">   ....: </span>

<span class="gp">In [21]: </span><span class="n">df</span>
<span class="gh">Out[21]: </span>
<span class="go">   Class       Name  Chinese  Math</span>
<span class="go">0      1  San Zhang       80    80</span>
<span class="go">1      2      Si Li       90    75</span>

<span class="gp">In [22]: </span><span class="n">df_melted</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">melt</span><span class="p">(</span><span class="n">id_vars</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Class&#39;</span><span class="p">,</span> <span class="s1">&#39;Name&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                    <span class="n">value_vars</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                    <span class="n">var_name</span> <span class="o">=</span> <span class="s1">&#39;Subject&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                    <span class="n">value_name</span> <span class="o">=</span> <span class="s1">&#39;Grade&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [23]: </span><span class="n">df_melted</span>
<span class="gh">Out[23]: </span>
<span class="go">   Class       Name  Subject  Grade</span>
<span class="go">0      1  San Zhang  Chinese     80</span>
<span class="go">1      2      Si Li  Chinese     90</span>
<span class="go">2      1  San Zhang     Math     80</span>
<span class="go">3      2      Si Li     Math     75</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">melt</span></code> 的主要参数和压缩的过程如下图所示：</p>
<a class="reference internal image-reference" href="../_images/ch5_melt.png"><img alt="../_images/ch5_melt.png" class="align-center" src="../_images/ch5_melt.png" style="width: 500.0px; height: 480.0px;" /></a>
<p>前面提到了 <code class="docutils literal notranslate"><span class="pre">melt</span></code> 和 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 是一组互逆过程，那么就一定可以通过 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 操作把 <code class="docutils literal notranslate"><span class="pre">df_melted</span></code> 转回 <code class="docutils literal notranslate"><span class="pre">df</span></code> 的形式：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [24]: </span><span class="n">df_unmelted</span> <span class="o">=</span> <span class="n">df_melted</span><span class="o">.</span><span class="n">pivot</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Class&#39;</span><span class="p">,</span> <span class="s1">&#39;Name&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                              <span class="n">columns</span><span class="o">=</span><span class="s1">&#39;Subject&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                              <span class="n">values</span><span class="o">=</span><span class="s1">&#39;Grade&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [25]: </span><span class="n">df_unmelted</span> <span class="c1"># 下面需要恢复索引，并且重命名列索引名称</span>
<span class="gh">Out[25]: </span>
<span class="go">Subject          Chinese  Math</span>
<span class="go">Class Name                    </span>
<span class="go">1     San Zhang       80    80</span>
<span class="go">2     Si Li           90    75</span>

<span class="gp">In [26]: </span><span class="n">df_unmelted</span> <span class="o">=</span> <span class="n">df_unmelted</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span><span class="o">.</span><span class="n">rename_axis</span><span class="p">(</span>
<span class="gp">   ....: </span>                             <span class="n">columns</span><span class="o">=</span><span class="p">{</span><span class="s1">&#39;Subject&#39;</span><span class="p">:</span><span class="s1">&#39;&#39;</span><span class="p">})</span>
<span class="gp">   ....: </span>

<span class="gp">In [27]: </span><span class="n">df_unmelted</span><span class="o">.</span><span class="n">equals</span><span class="p">(</span><span class="n">df</span><span class="p">)</span>
<span class="gh">Out[27]: </span><span class="go">True</span>
</pre></div>
</div>
</section>
<section id="wide-to-long">
<h3>4. wide_to_long<a class="headerlink" href="#wide-to-long" title="Permalink to this heading">#</a></h3>
<p><code class="docutils literal notranslate"><span class="pre">melt</span></code> 方法中，在列索引中被压缩的一组值对应的列元素只能代表同一层次的含义，即 <code class="docutils literal notranslate"><span class="pre">values_name</span></code> 。现在如果列中包含了交叉类别，比如期中期末的类别和语文数学的类别，那么想要把 <code class="docutils literal notranslate"><span class="pre">values_name</span></code> 对应的 <code class="docutils literal notranslate"><span class="pre">Grade</span></code> 扩充为两列分别对应语文分数和数学分数，只把期中期末的信息压缩，这种需求下就要使用 <code class="docutils literal notranslate"><span class="pre">wide_to_long</span></code> 函数来完成。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [28]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Class&#39;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">],</span><span class="s1">&#39;Name&#39;</span><span class="p">:[</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                   <span class="s1">&#39;Chinese_Mid&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">75</span><span class="p">],</span> <span class="s1">&#39;Math_Mid&#39;</span><span class="p">:[</span><span class="mi">90</span><span class="p">,</span> <span class="mi">85</span><span class="p">],</span>
<span class="gp">   ....: </span>                   <span class="s1">&#39;Chinese_Final&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">75</span><span class="p">],</span> <span class="s1">&#39;Math_Final&#39;</span><span class="p">:[</span><span class="mi">90</span><span class="p">,</span> <span class="mi">85</span><span class="p">]})</span>
<span class="gp">   ....: </span>

<span class="gp">In [29]: </span><span class="n">df</span>
<span class="gh">Out[29]: </span>
<span class="go">   Class       Name  Chinese_Mid  Math_Mid  Chinese_Final  Math_Final</span>
<span class="go">0      1  San Zhang           80        90             80          90</span>
<span class="go">1      2      Si Li           75        85             75          85</span>

<span class="gp">In [30]: </span><span class="n">pd</span><span class="o">.</span><span class="n">wide_to_long</span><span class="p">(</span><span class="n">df</span><span class="p">,</span>
<span class="gp">   ....: </span>                <span class="n">stubnames</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Chinese&#39;</span><span class="p">,</span> <span class="s1">&#39;Math&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                <span class="n">i</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Class&#39;</span><span class="p">,</span> <span class="s1">&#39;Name&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                <span class="n">j</span><span class="o">=</span><span class="s1">&#39;Examination&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                <span class="n">sep</span><span class="o">=</span><span class="s1">&#39;_&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                <span class="n">suffix</span><span class="o">=</span><span class="s1">&#39;.+&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>
<span class="gh">Out[30]: </span>
<span class="go">                             Chinese  Math</span>
<span class="go">Class Name      Examination               </span>
<span class="go">1     San Zhang Mid               80    90</span>
<span class="go">                Final             80    90</span>
<span class="go">2     Si Li     Mid               75    85</span>
<span class="go">                Final             75    85</span>
</pre></div>
</div>
<p>具体的变换过程由下图进行展示，属相同概念的元素使用了一致的颜色标出：</p>
<a class="reference internal image-reference" href="../_images/ch5_wtl.png"><img alt="../_images/ch5_wtl.png" class="align-center" src="../_images/ch5_wtl.png" style="width: 500.0px; height: 480.0px;" /></a>
<p>下面给出一个比较复杂的案例，把之前在 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 一节中多列操作的结果（产生了多级索引），利用 <code class="docutils literal notranslate"><span class="pre">wide_to_long</span></code> 函数，将其转为原来的形态。其中，使用了第八章的 <code class="docutils literal notranslate"><span class="pre">str.split</span></code> 函数，目前暂时只需将其理解为对序列按照某个分隔符进行拆分即可。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [31]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">pivot_multi</span><span class="o">.</span><span class="n">copy</span><span class="p">()</span>

<span class="gp">In [32]: </span><span class="n">res</span><span class="o">.</span><span class="n">columns</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">columns</span><span class="o">.</span><span class="n">map</span><span class="p">(</span><span class="k">lambda</span> <span class="n">x</span><span class="p">:</span><span class="s1">&#39;_&#39;</span><span class="o">.</span><span class="n">join</span><span class="p">(</span><span class="n">x</span><span class="p">))</span>

<span class="gp">In [33]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>

<span class="gp">In [34]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">wide_to_long</span><span class="p">(</span><span class="n">res</span><span class="p">,</span> <span class="n">stubnames</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;Grade&#39;</span><span class="p">,</span> <span class="s1">&#39;rank&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                           <span class="n">i</span> <span class="o">=</span> <span class="p">[</span><span class="s1">&#39;Class&#39;</span><span class="p">,</span> <span class="s1">&#39;Name&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                           <span class="n">j</span> <span class="o">=</span> <span class="s1">&#39;Subject_Examination&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                           <span class="n">sep</span> <span class="o">=</span> <span class="s1">&#39;_&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                           <span class="n">suffix</span> <span class="o">=</span> <span class="s1">&#39;.+&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [35]: </span><span class="n">res</span>
<span class="gh">Out[35]: </span>
<span class="go">                                     Grade  rank</span>
<span class="go">Class Name      Subject_Examination             </span>
<span class="go">1     San Zhang Chinese_Mid             80    10</span>
<span class="go">                Chinese_Final           75    15</span>
<span class="go">                Math_Mid                90    20</span>
<span class="go">                Math_Final              85     7</span>
<span class="go">2     Si Li     Chinese_Mid             85    21</span>
<span class="go">                Chinese_Final           65    15</span>
<span class="go">                Math_Mid                92     6</span>
<span class="go">                Math_Final              88     2</span>
</pre></div>
</div>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [36]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">reset_index</span><span class="p">()</span>

<span class="gp">In [37]: </span><span class="n">res</span><span class="p">[[</span><span class="s1">&#39;Subject&#39;</span><span class="p">,</span> <span class="s1">&#39;Examination&#39;</span><span class="p">]]</span> <span class="o">=</span> <span class="n">res</span><span class="p">[</span>
<span class="gp">   ....: </span>                <span class="s1">&#39;Subject_Examination&#39;</span><span class="p">]</span><span class="o">.</span><span class="n">str</span><span class="o">.</span><span class="n">split</span><span class="p">(</span><span class="s1">&#39;_&#39;</span><span class="p">,</span> <span class="n">expand</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [38]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">res</span><span class="p">[[</span><span class="s1">&#39;Class&#39;</span><span class="p">,</span> <span class="s1">&#39;Name&#39;</span><span class="p">,</span> <span class="s1">&#39;Examination&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>           <span class="s1">&#39;Subject&#39;</span><span class="p">,</span> <span class="s1">&#39;Grade&#39;</span><span class="p">,</span> <span class="s1">&#39;rank&#39;</span><span class="p">]]</span><span class="o">.</span><span class="n">sort_values</span><span class="p">(</span><span class="s1">&#39;Subject&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [39]: </span><span class="n">res</span> <span class="o">=</span> <span class="n">res</span><span class="o">.</span><span class="n">reset_index</span><span class="p">(</span><span class="n">drop</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>

<span class="gp">In [40]: </span><span class="n">res</span>
<span class="gh">Out[40]: </span>
<span class="go">   Class       Name Examination  Subject  Grade  rank</span>
<span class="go">0      1  San Zhang         Mid  Chinese     80    10</span>
<span class="go">1      1  San Zhang       Final  Chinese     75    15</span>
<span class="go">2      2      Si Li         Mid  Chinese     85    21</span>
<span class="go">3      2      Si Li       Final  Chinese     65    15</span>
<span class="go">4      1  San Zhang         Mid     Math     90    20</span>
<span class="go">5      1  San Zhang       Final     Math     85     7</span>
<span class="go">6      2      Si Li         Mid     Math     92     6</span>
<span class="go">7      2      Si Li       Final     Math     88     2</span>
</pre></div>
</div>
</section>
</section>
<section id="id3">
<h2>二、索引的变形<a class="headerlink" href="#id3" title="Permalink to this heading">#</a></h2>
<section id="stackunstack">
<h3>1. stack与unstack<a class="headerlink" href="#stackunstack" title="Permalink to this heading">#</a></h3>
<p>在第二章中提到了利用 <code class="docutils literal notranslate"><span class="pre">swaplevel</span></code> 或者 <code class="docutils literal notranslate"><span class="pre">reorder_levels</span></code> 进行索引内部的层交换，下面就要讨论 <span class="red">行列索引之间</span> 的交换，由于这种交换带来了 <code class="docutils literal notranslate"><span class="pre">DataFrame</span></code> 维度上的变化，因此属于变形操作。在第一节中提到的4种变形函数与其不同之处在于，它们都属于某一列或几列 <span class="red">元素</span> 和 <span class="red">列索引</span> 之间的转换，而不是索引之间的转换。</p>
<p><code class="docutils literal notranslate"><span class="pre">unstack</span></code> 函数的作用是把行索引转为列索引，例如下面这个简单的例子：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [41]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">((</span><span class="mi">4</span><span class="p">,</span><span class="mi">2</span><span class="p">)),</span>
<span class="gp">   ....: </span>                  <span class="n">index</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">([(</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;cat&#39;</span><span class="p">,</span> <span class="s1">&#39;big&#39;</span><span class="p">),</span>
<span class="gp">   ....: </span>                                    <span class="p">(</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;dog&#39;</span><span class="p">,</span> <span class="s1">&#39;small&#39;</span><span class="p">),</span>
<span class="gp">   ....: </span>                                    <span class="p">(</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;cat&#39;</span><span class="p">,</span> <span class="s1">&#39;big&#39;</span><span class="p">),</span>
<span class="gp">   ....: </span>                                    <span class="p">(</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;dog&#39;</span><span class="p">,</span> <span class="s1">&#39;small&#39;</span><span class="p">)]),</span>
<span class="gp">   ....: </span>                  <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;col_1&#39;</span><span class="p">,</span> <span class="s1">&#39;col_2&#39;</span><span class="p">])</span>
<span class="gp">   ....: </span>

<span class="gp">In [42]: </span><span class="n">df</span>
<span class="gh">Out[42]: </span>
<span class="go">             col_1  col_2</span>
<span class="go">A cat big      1.0    1.0</span>
<span class="go">  dog small    1.0    1.0</span>
<span class="go">B cat big      1.0    1.0</span>
<span class="go">  dog small    1.0    1.0</span>

<span class="gp">In [43]: </span><span class="n">df</span><span class="o">.</span><span class="n">unstack</span><span class="p">()</span>
<span class="gh">Out[43]: </span>
<span class="go">      col_1       col_2      </span>
<span class="go">        big small   big small</span>
<span class="go">A cat   1.0   NaN   1.0   NaN</span>
<span class="go">  dog   NaN   1.0   NaN   1.0</span>
<span class="go">B cat   1.0   NaN   1.0   NaN</span>
<span class="go">  dog   NaN   1.0   NaN   1.0</span>
</pre></div>
</div>
<p><code class="docutils literal notranslate"><span class="pre">unstack</span></code> 的主要参数是移动的层号，默认转化最内层，移动到列索引的最内层，同时支持同时转化多个层：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [44]: </span><span class="n">df</span><span class="o">.</span><span class="n">unstack</span><span class="p">(</span><span class="mi">2</span><span class="p">)</span>
<span class="gh">Out[44]: </span>
<span class="go">      col_1       col_2      </span>
<span class="go">        big small   big small</span>
<span class="go">A cat   1.0   NaN   1.0   NaN</span>
<span class="go">  dog   NaN   1.0   NaN   1.0</span>
<span class="go">B cat   1.0   NaN   1.0   NaN</span>
<span class="go">  dog   NaN   1.0   NaN   1.0</span>

<span class="gp">In [45]: </span><span class="n">df</span><span class="o">.</span><span class="n">unstack</span><span class="p">([</span><span class="mi">0</span><span class="p">,</span><span class="mi">2</span><span class="p">])</span>
<span class="gh">Out[45]: </span>
<span class="go">    col_1                  col_2                 </span>
<span class="go">        A          B           A          B      </span>
<span class="go">      big small  big small   big small  big small</span>
<span class="go">cat   1.0   NaN  1.0   NaN   1.0   NaN  1.0   NaN</span>
<span class="go">dog   NaN   1.0  NaN   1.0   NaN   1.0  NaN   1.0</span>
</pre></div>
</div>
<p>类似于 <code class="docutils literal notranslate"><span class="pre">pivot</span></code> 中的唯一性要求，在 <code class="docutils literal notranslate"><span class="pre">unstack</span></code> 中必须保证 <span class="red">被转为列索引的行索引层</span> 和 <span class="red">被保留的行索引层</span> 构成的组合是唯一的，例如把前两个列索引改成相同的破坏唯一性，那么就会报错：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [46]: </span><span class="n">my_index</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">index</span><span class="o">.</span><span class="n">to_list</span><span class="p">()</span>

<span class="gp">In [47]: </span><span class="n">my_index</span><span class="p">[</span><span class="mi">1</span><span class="p">]</span> <span class="o">=</span> <span class="n">my_index</span><span class="p">[</span><span class="mi">0</span><span class="p">]</span>

<span class="gp">In [48]: </span><span class="n">df</span><span class="o">.</span><span class="n">index</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">(</span><span class="n">my_index</span><span class="p">)</span>

<span class="gp">In [49]: </span><span class="n">df</span>
<span class="gh">Out[49]: </span>
<span class="go">             col_1  col_2</span>
<span class="go">A cat big      1.0    1.0</span>
<span class="go">      big      1.0    1.0</span>
<span class="go">B cat big      1.0    1.0</span>
<span class="go">  dog small    1.0    1.0</span>

<span class="gp">In [50]: </span><span class="k">try</span><span class="p">:</span>
<span class="gp">   ....: </span>   <span class="n">df</span><span class="o">.</span><span class="n">unstack</span><span class="p">()</span>
<span class="gp">   ....: </span><span class="k">except</span> <span class="ne">Exception</span> <span class="k">as</span> <span class="n">e</span><span class="p">:</span>
<span class="gp">   ....: </span>   <span class="n">Err_Msg</span> <span class="o">=</span> <span class="n">e</span>
<span class="gp">   ....: </span>

<span class="gp">In [51]: </span><span class="n">Err_Msg</span>
<span class="gh">Out[51]: </span><span class="go">ValueError(&#39;Index contains duplicate entries, cannot reshape&#39;)</span>
</pre></div>
</div>
<p>与 <code class="docutils literal notranslate"><span class="pre">unstack</span></code> 相反， <code class="docutils literal notranslate"><span class="pre">stack</span></code> 的作用就是把列索引的层压入行索引，其用法完全类似。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [52]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">(</span><span class="n">np</span><span class="o">.</span><span class="n">ones</span><span class="p">((</span><span class="mi">4</span><span class="p">,</span><span class="mi">2</span><span class="p">)),</span>
<span class="gp">   ....: </span>                  <span class="n">index</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">Index</span><span class="p">([(</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;cat&#39;</span><span class="p">,</span> <span class="s1">&#39;big&#39;</span><span class="p">),</span>
<span class="gp">   ....: </span>                                    <span class="p">(</span><span class="s1">&#39;A&#39;</span><span class="p">,</span> <span class="s1">&#39;dog&#39;</span><span class="p">,</span> <span class="s1">&#39;small&#39;</span><span class="p">),</span>
<span class="gp">   ....: </span>                                    <span class="p">(</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;cat&#39;</span><span class="p">,</span> <span class="s1">&#39;big&#39;</span><span class="p">),</span>
<span class="gp">   ....: </span>                                    <span class="p">(</span><span class="s1">&#39;B&#39;</span><span class="p">,</span> <span class="s1">&#39;dog&#39;</span><span class="p">,</span> <span class="s1">&#39;small&#39;</span><span class="p">)]),</span>
<span class="gp">   ....: </span>                  <span class="n">columns</span><span class="o">=</span><span class="p">[</span><span class="s1">&#39;index_1&#39;</span><span class="p">,</span> <span class="s1">&#39;index_2&#39;</span><span class="p">])</span><span class="o">.</span><span class="n">T</span>
<span class="gp">   ....: </span>

<span class="gp">In [53]: </span><span class="n">df</span>
<span class="gh">Out[53]: </span>
<span class="go">           A          B      </span>
<span class="go">         cat   dog  cat   dog</span>
<span class="go">         big small  big small</span>
<span class="go">index_1  1.0   1.0  1.0   1.0</span>
<span class="go">index_2  1.0   1.0  1.0   1.0</span>

<span class="gp">In [54]: </span><span class="n">df</span><span class="o">.</span><span class="n">stack</span><span class="p">()</span>
<span class="gh">Out[54]: </span>
<span class="go">                 A         B     </span>
<span class="go">               cat  dog  cat  dog</span>
<span class="go">index_1 big    1.0  NaN  1.0  NaN</span>
<span class="go">        small  NaN  1.0  NaN  1.0</span>
<span class="go">index_2 big    1.0  NaN  1.0  NaN</span>
<span class="go">        small  NaN  1.0  NaN  1.0</span>

<span class="gp">In [55]: </span><span class="n">df</span><span class="o">.</span><span class="n">stack</span><span class="p">([</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">])</span>
<span class="gh">Out[55]: </span>
<span class="go">                     A    B</span>
<span class="go">index_1 cat big    1.0  1.0</span>
<span class="go">        dog small  1.0  1.0</span>
<span class="go">index_2 cat big    1.0  1.0</span>
<span class="go">        dog small  1.0  1.0</span>
</pre></div>
</div>
</section>
<section id="id4">
<h3>2. 聚合与变形的关系<a class="headerlink" href="#id4" title="Permalink to this heading">#</a></h3>
<p>在上面介绍的所有函数中，除了带有聚合效果的 <code class="docutils literal notranslate"><span class="pre">pivot_table</span></code> 以外，所有的函数在变形前后并不会带来 <code class="docutils literal notranslate"><span class="pre">values</span></code> 个数的改变，只是这些值在呈现的形式上发生了变化。在上一章讨论的分组聚合操作，由于生成了新的行列索引，因此必然也属于某种特殊的变形操作，但由于聚合之后把原来的多个值变为了一个值，因此 <code class="docutils literal notranslate"><span class="pre">values</span></code> 的个数产生了变化，这也是分组聚合与变形函数的最大区别。</p>
</section>
</section>
<section id="id5">
<h2>三、其他变形函数<a class="headerlink" href="#id5" title="Permalink to this heading">#</a></h2>
<section id="crosstab">
<h3>1. crosstab<a class="headerlink" href="#crosstab" title="Permalink to this heading">#</a></h3>
<p><code class="docutils literal notranslate"><span class="pre">crosstab</span></code> 是一个地位尴尬的函数，因为它能实现的所有功能 <code class="docutils literal notranslate"><span class="pre">pivot_table</span></code> 都能完成。在默认状态下， <code class="docutils literal notranslate"><span class="pre">crosstab</span></code> 可以统计元素组合出现的频数，即 <code class="docutils literal notranslate"><span class="pre">count</span></code> 操作。例如统计 <code class="docutils literal notranslate"><span class="pre">learn_pandas</span></code> 数据集中学校和转系情况对应的频数：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [56]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/learn_pandas.csv&#39;</span><span class="p">)</span>

<span class="gp">In [57]: </span><span class="n">pd</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">School</span><span class="p">,</span> <span class="n">columns</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Transfer</span><span class="p">)</span>
<span class="gh">Out[57]: </span>
<span class="go">Transfer                        N  Y</span>
<span class="go">School                              </span>
<span class="go">Fudan University               38  1</span>
<span class="go">Peking University              28  2</span>
<span class="go">Shanghai Jiao Tong University  53  0</span>
<span class="go">Tsinghua University            62  4</span>
</pre></div>
</div>
<p>这等价于如下 <code class="docutils literal notranslate"><span class="pre">crosstab</span></code> 的如下写法，这里的 <code class="docutils literal notranslate"><span class="pre">aggfunc</span></code> 即聚合参数：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [58]: </span><span class="n">pd</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">School</span><span class="p">,</span> <span class="n">columns</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Transfer</span><span class="p">,</span>
<span class="gp">   ....: </span>            <span class="n">values</span> <span class="o">=</span> <span class="p">[</span><span class="mi">0</span><span class="p">]</span><span class="o">*</span><span class="n">df</span><span class="o">.</span><span class="n">shape</span><span class="p">[</span><span class="mi">0</span><span class="p">],</span> <span class="n">aggfunc</span> <span class="o">=</span> <span class="s1">&#39;count&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>
<span class="gh">Out[58]: </span>
<span class="go">Transfer                          N    Y</span>
<span class="go">School                                  </span>
<span class="go">Fudan University               38.0  1.0</span>
<span class="go">Peking University              28.0  2.0</span>
<span class="go">Shanghai Jiao Tong University  53.0  NaN</span>
<span class="go">Tsinghua University            62.0  4.0</span>
</pre></div>
</div>
<p>同样，可以利用 <code class="docutils literal notranslate"><span class="pre">pivot_table</span></code> 进行等价操作，由于这里统计的是组合的频数，因此 <code class="docutils literal notranslate"><span class="pre">values</span></code> 参数无论传入哪一个列都不会影响最后的结果：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [59]: </span><span class="n">df</span><span class="o">.</span><span class="n">pivot_table</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="s1">&#39;School&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">columns</span> <span class="o">=</span> <span class="s1">&#39;Transfer&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">values</span> <span class="o">=</span> <span class="s1">&#39;Name&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>               <span class="n">aggfunc</span> <span class="o">=</span> <span class="s1">&#39;count&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>
<span class="gh">Out[59]: </span>
<span class="go">Transfer                          N    Y</span>
<span class="go">School                                  </span>
<span class="go">Fudan University               38.0  1.0</span>
<span class="go">Peking University              28.0  2.0</span>
<span class="go">Shanghai Jiao Tong University  53.0  NaN</span>
<span class="go">Tsinghua University            62.0  4.0</span>
</pre></div>
</div>
<p>从上面可以看出这两个函数的区别在于， <code class="docutils literal notranslate"><span class="pre">crosstab</span></code> 的对应位置传入的是具体的序列，而 <code class="docutils literal notranslate"><span class="pre">pivot_table</span></code> 传入的是被调用表对应的名字，若传入序列对应的值则会报错。</p>
<p>除了默认状态下的 <code class="docutils literal notranslate"><span class="pre">count</span></code> 统计，所有的聚合字符串和返回标量的自定义函数都是可用的，例如统计对应组合的身高均值：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [60]: </span><span class="n">pd</span><span class="o">.</span><span class="n">crosstab</span><span class="p">(</span><span class="n">index</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">School</span><span class="p">,</span> <span class="n">columns</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Transfer</span><span class="p">,</span>
<span class="gp">   ....: </span>            <span class="n">values</span> <span class="o">=</span> <span class="n">df</span><span class="o">.</span><span class="n">Height</span><span class="p">,</span> <span class="n">aggfunc</span> <span class="o">=</span> <span class="s1">&#39;mean&#39;</span><span class="p">)</span>
<span class="gp">   ....: </span>
<span class="gh">Out[60]: </span>
<span class="go">Transfer                                N       Y</span>
<span class="go">School                                           </span>
<span class="go">Fudan University               162.043750  177.20</span>
<span class="go">Peking University              163.429630  162.40</span>
<span class="go">Shanghai Jiao Tong University  163.953846     NaN</span>
<span class="go">Tsinghua University            163.253571  164.55</span>
</pre></div>
</div>
</section>
<section id="explode">
<h3>2. explode<a class="headerlink" href="#explode" title="Permalink to this heading">#</a></h3>
<p><code class="docutils literal notranslate"><span class="pre">explode</span></code> 参数能够对某一列的元素进行纵向的展开，被展开的单元格必须存储 <code class="docutils literal notranslate"><span class="pre">list,</span> <span class="pre">tuple,</span> <span class="pre">Series,</span> <span class="pre">np.ndarray</span></code> 中的一种类型。</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [61]: </span><span class="n">df_ex</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;A&#39;</span><span class="p">:</span> <span class="p">[[</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">],</span>
<span class="gp">   ....: </span>                         <span class="s1">&#39;my_str&#39;</span><span class="p">,</span>
<span class="gp">   ....: </span>                         <span class="p">{</span><span class="mi">1</span><span class="p">,</span> <span class="mi">2</span><span class="p">},</span>
<span class="gp">   ....: </span>                         <span class="n">pd</span><span class="o">.</span><span class="n">Series</span><span class="p">([</span><span class="mi">3</span><span class="p">,</span> <span class="mi">4</span><span class="p">])],</span>
<span class="gp">   ....: </span>                      <span class="s1">&#39;B&#39;</span><span class="p">:</span> <span class="mi">1</span><span class="p">})</span>
<span class="gp">   ....: </span>

<span class="gp">In [62]: </span><span class="n">df_ex</span><span class="o">.</span><span class="n">explode</span><span class="p">(</span><span class="s1">&#39;A&#39;</span><span class="p">)</span>
<span class="gh">Out[62]: </span>
<span class="go">        A  B</span>
<span class="go">0       1  1</span>
<span class="go">0       2  1</span>
<span class="go">1  my_str  1</span>
<span class="go">2       1  1</span>
<span class="go">2       2  1</span>
<span class="go">3       3  1</span>
<span class="go">3       4  1</span>
</pre></div>
</div>
</section>
<section id="get-dummies">
<h3>3. get_dummies<a class="headerlink" href="#get-dummies" title="Permalink to this heading">#</a></h3>
<p><code class="docutils literal notranslate"><span class="pre">get_dummies</span></code> 是用于特征构建的重要函数之一，其作用是把类别特征转为指示变量。例如，对年级一列转为指示变量，属于某一个年级的对应列标记为1，否则为0：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [63]: </span><span class="n">pd</span><span class="o">.</span><span class="n">get_dummies</span><span class="p">(</span><span class="n">df</span><span class="o">.</span><span class="n">Grade</span><span class="p">)</span><span class="o">.</span><span class="n">head</span><span class="p">()</span>
<span class="gh">Out[63]: </span>
<span class="go">   Freshman  Junior  Senior  Sophomore</span>
<span class="go">0         1       0       0          0</span>
<span class="go">1         1       0       0          0</span>
<span class="go">2         0       0       1          0</span>
<span class="go">3         0       0       0          1</span>
<span class="go">4         0       0       0          1</span>
</pre></div>
</div>
</section>
</section>
<section id="id6">
<h2>四、练习<a class="headerlink" href="#id6" title="Permalink to this heading">#</a></h2>
<section id="ex1">
<h3>Ex1：美国非法药物数据集<a class="headerlink" href="#ex1" title="Permalink to this heading">#</a></h3>
<p>现有一份关于美国非法药物的数据集，其中 <code class="docutils literal notranslate"><span class="pre">SubstanceName,</span> <span class="pre">DrugReports</span></code> 分别指药物名称和报告数量：</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [64]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">read_csv</span><span class="p">(</span><span class="s1">&#39;data/drugs.csv&#39;</span><span class="p">)</span><span class="o">.</span><span class="n">sort_values</span><span class="p">([</span>
<span class="gp">   ....: </span>     <span class="s1">&#39;State&#39;</span><span class="p">,</span><span class="s1">&#39;COUNTY&#39;</span><span class="p">,</span><span class="s1">&#39;SubstanceName&#39;</span><span class="p">],</span><span class="n">ignore_index</span><span class="o">=</span><span class="kc">True</span><span class="p">)</span>
<span class="gp">   ....: </span>

<span class="gp">In [65]: </span><span class="n">df</span><span class="o">.</span><span class="n">head</span><span class="p">(</span><span class="mi">3</span><span class="p">)</span>
<span class="gh">Out[65]: </span>
<span class="go">   YYYY State COUNTY  SubstanceName  DrugReports</span>
<span class="go">0  2011    KY  ADAIR  Buprenorphine            3</span>
<span class="go">1  2012    KY  ADAIR  Buprenorphine            5</span>
<span class="go">2  2013    KY  ADAIR  Buprenorphine            4</span>
</pre></div>
</div>
<ol class="arabic simple">
<li><p>将数据转为如下的形式：</p></li>
</ol>
<a class="reference internal image-reference" href="../_images/Ex5_1.png"><img alt="../_images/Ex5_1.png" class="align-center" src="../_images/Ex5_1.png" style="width: 637.0px; height: 173.0px;" /></a>
<ol class="arabic simple" start="2">
<li><p>将第1问中的结果恢复为原表。</p></li>
<li><p>按 <code class="docutils literal notranslate"><span class="pre">State</span></code> 分别统计每年的报告数量总和，其中 <code class="docutils literal notranslate"><span class="pre">State,</span> <span class="pre">YYYY</span></code> 分别为列索引和行索引，要求分别使用 <code class="docutils literal notranslate"><span class="pre">pivot_table</span></code> 函数与 <code class="docutils literal notranslate"><span class="pre">groupby+unstack</span></code> 两种不同的策略实现，并体会它们之间的联系。</p></li>
</ol>
</section>
<section id="ex2-wide-to-long">
<h3>Ex2：特殊的wide_to_long方法<a class="headerlink" href="#ex2-wide-to-long" title="Permalink to this heading">#</a></h3>
<p>从功能上看， <code class="docutils literal notranslate"><span class="pre">melt</span></code> 方法应当属于 <code class="docutils literal notranslate"><span class="pre">wide_to_long</span></code> 的一种特殊情况，即 <code class="docutils literal notranslate"><span class="pre">stubnames</span></code> 只有一类。请使用 <code class="docutils literal notranslate"><span class="pre">wide_to_long</span></code> 生成 <code class="docutils literal notranslate"><span class="pre">melt</span></code> 一节中的 <code class="docutils literal notranslate"><span class="pre">df_melted</span></code> 。（提示：对列名增加适当的前缀）</p>
<div class="highlight-ipython notranslate"><div class="highlight"><pre><span></span><span class="gp">In [66]: </span><span class="n">df</span> <span class="o">=</span> <span class="n">pd</span><span class="o">.</span><span class="n">DataFrame</span><span class="p">({</span><span class="s1">&#39;Class&#39;</span><span class="p">:[</span><span class="mi">1</span><span class="p">,</span><span class="mi">2</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Name&#39;</span><span class="p">:[</span><span class="s1">&#39;San Zhang&#39;</span><span class="p">,</span> <span class="s1">&#39;Si Li&#39;</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Chinese&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">90</span><span class="p">],</span>
<span class="gp">   ....: </span>                  <span class="s1">&#39;Math&#39;</span><span class="p">:[</span><span class="mi">80</span><span class="p">,</span> <span class="mi">75</span><span class="p">]})</span>
<span class="gp">   ....: </span>

<span class="gp">In [67]: </span><span class="n">df</span>
<span class="gh">Out[67]: </span>
<span class="go">   Class       Name  Chinese  Math</span>
<span class="go">0      1  San Zhang       80    80</span>
<span class="go">1      2      Si Li       90    75</span>
</pre></div>
</div>
</section>
</section>
</section>


              </article>
              

              
          </div>
          
      </div>
    </div>

  
  
  <!-- Scripts loaded after <body> so the DOM is not blocked -->
  <script src="../_static/scripts/pydata-sphinx-theme.js?digest=92025949c220c2e29695"></script>

<footer class="bd-footer"><div class="bd-footer__inner container">
  
  <div class="footer-item">
    <p class="copyright">
    &copy; Copyright 2020-2022, Datawhale, 耿远昊.<br>
</p>
  </div>
  
  <div class="footer-item">
    <p class="sphinx-version">
Created using <a href="http://sphinx-doc.org/">Sphinx</a> 5.0.2.<br>
</p>
  </div>
  
</div>
</footer>
  </body>
</html>